7  scTCRseq: Mapping clones from scTCRseq to bulkTCRseq

7.1 Set up workspace

# Libraries
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Seurat)
Loading required package: SeuratObject
Loading required package: sp

Attaching package: 'SeuratObject'

The following objects are masked from 'package:base':

    intersect, t
library(dplyr)
library(scRepertoire)

7.2 Load tumor data with clones loaded with a custom scRepertoire variable: vjaa (V gene, J gene, CDR3 aa)

tcrs.list <- readRDS("sctcr_scRep_combined_TCR_skin_tumor_Part1.Rds")

7.3 Concatenate TCRs into one dataframe

tcrs <- do.call(rbind, tcrs.list)

7.4 Create a dataframe of clones per patient, where each row is a chain from a clone.

Tumor

# Splitting TRA and TRB
sctils_clone_chains <- tcrs %>%
  filter(Site == "Tumor") %>%
  group_by(Patient) %>%
  dplyr::count(vjaa, Patient, Timepoint) %>%
  arrange(desc(n)) %>%
  dplyr::rename("tumor_vjaa_counts" = "n") %>%
  mutate(bulk_chain = strsplit(as.character(vjaa), "_")) %>%
  unnest(bulk_chain) %>%
  filter(bulk_chain != "NA;NA") %>%
  pivot_wider(id_cols = c(Patient, vjaa, bulk_chain), names_from = Timepoint, values_from = tumor_vjaa_counts) %>%
  replace(is.na(.), 0)

# Get clones with just 1 TRA or TRB chain
sctils_chain_one <- sctils_clone_chains %>%
  filter(str_count(bulk_chain, ";") == 1)

# Unnest clones with more than 1 TRA or TRB chain
sctils_chain_multi <- sctils_clone_chains %>%
  filter(str_count(bulk_chain, ";") > 1) %>%
  # split vjaa into 4 columns
  mutate(vj1 = str_split_i(bulk_chain, ";", 1),
         vj2 = str_split_i(bulk_chain, ";", 2),
         aa1 = str_split_i(bulk_chain, ";", 3),
         aa2 = str_split_i(bulk_chain, ";", 4),
         vjaa1 = paste0(vj1, ";", aa1),
         vjaa2 = paste0(vj2, ";", aa2)) %>%
  select(Patient, vjaa1, vjaa2, vjaa, W00, W12, W20, PD) %>%
  pivot_longer(cols = c("vjaa1", "vjaa2"), names_to = "chain", values_to = "bulk_chain") %>%
  select(-chain)

sctils_clone_chains <- rbind(sctils_chain_one, sctils_chain_multi)

# Add vjaa counts across tumor
sctils_clone_chains <- sctils_clone_chains %>%
  mutate(tumor_vjaa_counts = W20 + W12 + W00 + PD)

# Check that all clones are unique across patients
sctils_clone_chains %>% distinct(vjaa, Patient, .keep_all = TRUE) %>% filter(duplicated(vjaa))
# A tibble: 0 × 8
# Groups:   Patient [0]
# ℹ 8 variables: Patient <chr>, vjaa <chr>, bulk_chain <chr>, W20 <int>,
#   W12 <int>, W00 <int>, PD <int>, tumor_vjaa_counts <int>

Skin

# Splitting TRA and TRB from one clone into separate rows
scskin_clone_chains <- tcrs %>%
  filter(Site == "Skin") %>%
  group_by(Patient) %>%
  dplyr::count(vjaa, Timepoint, Patient) %>%
  arrange(desc(n)) %>%
  dplyr::rename("skin_vjaa_counts" = "n") %>%
  mutate(bulk_chain = strsplit(as.character(vjaa), "_")) %>%
  unnest(bulk_chain) %>%
  filter(bulk_chain != "NA;NA") %>%
  pivot_wider(id_cols = c(Patient, vjaa, bulk_chain), names_from = Timepoint, values_from = skin_vjaa_counts) %>%
  replace(is.na(.), 0)

# Get clones with just 1 TRA or TRB chain
scskin_chain_one <- scskin_clone_chains %>%
  filter(str_count(bulk_chain, ";") == 1)

# Unnest clones with more than 1 TRA or TRB chain
scskin_chain_multi <- scskin_clone_chains %>%
  filter(str_count(bulk_chain, ";") > 1) %>%
  # split vjaa into 4 columns
  mutate(vj1 = str_split_i(bulk_chain, ";", 1),
         vj2 = str_split_i(bulk_chain, ";", 2),
         aa1 = str_split_i(bulk_chain, ";", 3),
         aa2 = str_split_i(bulk_chain, ";", 4),
         vjaa1 = paste0(vj1, ";", aa1),
         vjaa2 = paste0(vj2, ";", aa2)) %>%
  select(Patient, vjaa1, vjaa2, vjaa, Pre3rd, Post3rd) %>%
  pivot_longer(cols = c("vjaa1", "vjaa2"), names_to = "chain", values_to = "bulk_chain") %>%
  select(-chain)

scskin_clone_chains <- rbind(scskin_chain_one, scskin_chain_multi)

# Add vjaa counts across skin
scskin_clone_chains <- scskin_clone_chains %>%
  mutate(skin_vjaa_counts = Post3rd + Pre3rd)

# All clones are unique across patients
scskin_clone_chains %>% distinct(vjaa, Patient, .keep_all = TRUE) %>% filter(duplicated(vjaa))
# A tibble: 0 × 6
# Groups:   Patient [0]
# ℹ 6 variables: Patient <chr>, vjaa <chr>, bulk_chain <chr>, Post3rd <int>,
#   Pre3rd <int>, skin_vjaa_counts <int>

7.5 Merge clones into one dataframe and count clone frequency over both tumor and skin

sc_clone_chains <- sctils_clone_chains %>%
  full_join(scskin_clone_chains, by = c("Patient", "vjaa", "bulk_chain")) %>%
  replace(is.na(.), 0) %>%
  mutate(total_vjaa_counts = tumor_vjaa_counts + skin_vjaa_counts) %>%
  arrange(desc(total_vjaa_counts))

7.6 Create clone IDs based on decreasing clone frequency in both skin and tumor

sc_clone_ids <- sc_clone_chains %>%
  group_by(Patient) %>%
  distinct(Patient, vjaa) %>%
  mutate(obs = 1:n(),
         sc_clone_id = paste0(Patient, "_", obs)) %>%
  select(-obs) 

7.7 Add clone IDs to full dataframe

sc_clone_chains <- sc_clone_chains %>%
  left_join(sc_clone_ids, by = c("Patient", "vjaa"))

7.8 Reformat chains to match bulk format

sc_clone_chains <- sc_clone_chains %>% 
  # Reformat certain V genes to match bulk (bulk can't differentiate btwn certain V genes)
  mutate(bulk_chain = str_replace_all(bulk_chain, "TRBV12-3|TRBV12-4", "TRBV12-3/4"),
         bulk_chain = str_replace_all(bulk_chain, "TRBV6-2|TRBV6-3|TRBV6-5|TRBV6-6", "TRBV6-2/3/5/6"),
         bulk_chain = str_replace_all(bulk_chain, "TRAV29/DV5", "TRAV29DV5"),
         bulk_chain = str_replace_all(bulk_chain, "TRAV14/DV4", "TRAV14DV4"),
         bulk_chain = str_replace_all(bulk_chain, "TRAV23/DV6", "TRAV23DV6"),
         bulk_chain = str_replace_all(bulk_chain, "TRAV36/DV7", "TRAV36DV7"),
         bulk_chain = str_replace_all(bulk_chain, "TRAV38-2/DV8", "TRAV38-2DV8")) %>%
  # Reformat the . separator to ; as per bulk
  mutate(bulk_chain = str_replace_all(bulk_chain, fixed("."), ";"),)

7.9 Save clone mapping

write.csv(sc_clone_chains, "scTCR_bulkTCR_chain_mapping_Part4.csv", row.names = FALSE)

7.10 Get session info

sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Rocky Linux 8.10 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: America/New_York
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] scRepertoire_2.0.0 Seurat_5.1.0       SeuratObject_5.0.2 sp_2.2-0          
 [5] lubridate_1.9.4    forcats_1.0.0      stringr_1.5.1      dplyr_1.1.4       
 [9] purrr_1.0.4        readr_2.1.5        tidyr_1.3.1        tibble_3.2.1      
[13] ggplot2_3.5.1      tidyverse_2.0.0   

loaded via a namespace (and not attached):
  [1] cubature_2.1.1              RcppAnnoy_0.0.22           
  [3] splines_4.3.2               later_1.4.1                
  [5] bitops_1.0-9                polyclip_1.10-7            
  [7] fastDummies_1.7.5           lifecycle_1.0.4            
  [9] globals_0.16.3              lattice_0.22-7             
 [11] MASS_7.3-60.0.1             magrittr_2.0.3             
 [13] plotly_4.10.4               rmarkdown_2.29             
 [15] httpuv_1.6.15               sctransform_0.4.1          
 [17] spam_2.11-1                 spatstat.sparse_3.1-0      
 [19] reticulate_1.42.0           cowplot_1.1.3              
 [21] pbapply_1.7-2               RColorBrewer_1.1-3         
 [23] abind_1.4-8                 zlibbioc_1.48.2            
 [25] Rtsne_0.17                  GenomicRanges_1.54.1       
 [27] ggraph_2.2.1                BiocGenerics_0.48.1        
 [29] RCurl_1.98-1.17             tweenr_2.0.3               
 [31] evmix_2.12                  GenomeInfoDbData_1.2.11    
 [33] IRanges_2.36.0              S4Vectors_0.40.2           
 [35] ggrepel_0.9.5               irlba_2.3.5.1              
 [37] listenv_0.9.1               spatstat.utils_3.1-0       
 [39] iNEXT_3.0.1                 MatrixModels_0.5-3         
 [41] goftest_1.2-3               RSpectra_0.16-2            
 [43] spatstat.random_3.3-1       fitdistrplus_1.2-2         
 [45] parallelly_1.41.0           leiden_0.4.3.1             
 [47] codetools_0.2-20            DelayedArray_0.28.0        
 [49] ggforce_0.4.2               tidyselect_1.2.1           
 [51] farver_2.1.2                viridis_0.6.5              
 [53] matrixStats_1.5.0           stats4_4.3.2               
 [55] spatstat.explore_3.3-2      jsonlite_1.8.9             
 [57] tidygraph_1.3.1             progressr_0.15.1           
 [59] ggridges_0.5.6              ggalluvial_0.12.5          
 [61] survival_3.8-3              tools_4.3.2                
 [63] stringdist_0.9.12           ica_1.0-3                  
 [65] Rcpp_1.0.14                 glue_1.8.0                 
 [67] gridExtra_2.3               SparseArray_1.2.4          
 [69] xfun_0.50                   MatrixGenerics_1.14.0      
 [71] GenomeInfoDb_1.38.8         withr_3.0.2                
 [73] fastmap_1.2.0               SparseM_1.84-2             
 [75] digest_0.6.37               timechange_0.3.0           
 [77] R6_2.6.1                    mime_0.13                  
 [79] colorspace_2.1-1            scattermore_1.2            
 [81] tensor_1.5                  spatstat.data_3.1-2        
 [83] generics_0.1.3              data.table_1.15.4          
 [85] graphlayouts_1.1.1          httr_1.4.7                 
 [87] htmlwidgets_1.6.4           S4Arrays_1.2.1             
 [89] uwot_0.2.3                  pkgconfig_2.0.3            
 [91] gtable_0.3.6                lmtest_0.9-40              
 [93] SingleCellExperiment_1.24.0 XVector_0.42.0             
 [95] htmltools_0.5.8.1           dotCall64_1.2              
 [97] scales_1.3.0                Biobase_2.62.0             
 [99] png_0.1-8                   spatstat.univar_3.0-0      
[101] ggdendro_0.2.0              knitr_1.49                 
[103] rstudioapi_0.17.1           rjson_0.2.23               
[105] tzdb_0.5.0                  reshape2_1.4.4             
[107] nlme_3.1-168                zoo_1.8-13                 
[109] cachem_1.1.0                KernSmooth_2.23-26         
[111] parallel_4.3.2              miniUI_0.1.1.1             
[113] pillar_1.10.1               grid_4.3.2                 
[115] vctrs_0.6.5                 RANN_2.6.2                 
[117] VGAM_1.1-13                 promises_1.3.2             
[119] xtable_1.8-4                cluster_2.1.8.1            
[121] evaluate_1.0.1              truncdist_1.0-2            
[123] cli_3.6.3                   compiler_4.3.2             
[125] rlang_1.1.5                 crayon_1.5.3               
[127] future.apply_1.11.3         plyr_1.8.9                 
[129] stringi_1.8.4               viridisLite_0.4.2          
[131] deldir_2.0-4                munsell_0.5.1              
[133] gsl_2.1-8                   lazyeval_0.2.2             
[135] spatstat.geom_3.3-2         quantreg_6.1               
[137] Matrix_1.6-5                RcppHNSW_0.6.0             
[139] hms_1.1.3                   patchwork_1.3.0            
[141] future_1.34.0               shiny_1.9.1                
[143] SummarizedExperiment_1.32.0 evd_2.3-7.1                
[145] ROCR_1.0-11                 igraph_2.0.3               
[147] memoise_2.0.1